{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LLM regression testing workflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "axHrgN51KfN4",
    "outputId": "b5e199e9-d761-45c8-aa66-6a7145118830"
   },
   "outputs": [],
   "source": [
    "#!pip install evidently[llm]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Ut74KYEtNH_g"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from datetime import datetime, timedelta\n",
    "from io import BytesIO"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "AoyQoT6IK2Vi"
   },
   "source": [
    "To run open-source evaluations:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "A6mxNFYwKheS"
   },
   "outputs": [],
   "source": [
    "from evidently import ColumnMapping\n",
    "from evidently.report import Report\n",
    "from evidently.test_suite import TestSuite\n",
    "from evidently.metric_preset import TextEvals\n",
    "from evidently.descriptors import *\n",
    "from evidently.metrics import *\n",
    "from evidently.tests import *\n",
    "from evidently.features.llm_judge import BinaryClassificationPromptTemplate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "UqY4ChvuK31w"
   },
   "source": [
    "**Optional**: To work with Evidently Cloud:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "N0zbCWSRKkAu"
   },
   "outputs": [],
   "source": [
    "from evidently.ui.workspace.cloud import CloudWorkspace"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hqFYYEcJKskb"
   },
   "source": [
    "**Optional**: To manage dashboards as code remotely. You can also do this in the UI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pWIKauUUKtsV"
   },
   "outputs": [],
   "source": [
    "from evidently.ui.dashboards import DashboardPanelTestSuite\n",
    "from evidently.ui.dashboards import PanelValue\n",
    "from evidently.ui.dashboards import ReportFilter\n",
    "from evidently.ui.dashboards import TestFilter\n",
    "from evidently.ui.dashboards import TestSuitePanelType\n",
    "from evidently.renderers.html_widgets import WidgetSize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9R9KIquQKneA"
   },
   "source": [
    "# Prepare a dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wBcMGqRgOl-H"
   },
   "source": [
    "Get an example dataset. You can also download and import the CSV file directly ([Link](https://github.com/evidentlyai/evidently/blob/main/examples/how_to_questions/chat_df.csv))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6tIeNi7oM9tB"
   },
   "outputs": [],
   "source": [
    "# Download the example chat logs. The timeout keeps the cell from hanging on a stalled\n",
    "# connection, and raise_for_status() stops here on an HTTP error instead of handing\n",
    "# an error page to the CSV parser in the next cell.\n",
    "response = requests.get(\n",
    "    \"https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv\",\n",
    "    timeout=30,\n",
    ")\n",
    "response.raise_for_status()\n",
    "csv_content = BytesIO(response.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Q4ILCYdtQU-T"
   },
   "source": [
    "Read the CSV content into a DataFrame. Parse dates and set conversation \"start_time\" as index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "S5Q3SwDQO-Uo"
   },
   "outputs": [],
   "source": [
    "# Load the logs, parsing both timestamp columns as datetimes.\n",
    "assistant_logs = pd.read_csv(\n",
    "    csv_content,\n",
    "    index_col=0,\n",
    "    parse_dates=['start_time', 'end_time'],\n",
    ")\n",
    "\n",
    "# Use the conversation start time as the row index, named 'index'.\n",
    "# The 'start_time' column itself stays in the frame.\n",
    "assistant_logs.index = assistant_logs['start_time'].rename('index')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1CSJtC-2Qcw8"
   },
   "source": [
    "Preview:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ELXNX5IgGGtH"
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 591
    },
    "id": "4RqJYuB-QeBt",
    "outputId": "ea6be72c-7bf5-4f4b-a7af-026b8ba43b6f"
   },
   "outputs": [],
   "source": [
    "assistant_logs.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MSp-YVR3QrUj"
   },
   "source": [
    "# Set up your Workspace "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ZaTwnyeBSP91"
   },
   "source": [
    "To store the evaluation results, share them with others, and get a live monitoring dashboard, create your free Evidently Cloud account at: https://app.evidently.cloud/signup."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KIcgHAYcRGt2"
   },
   "source": [
    "To connect to the workspace from your Python environment:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "gRPb4PRCQyXu"
   },
   "outputs": [],
   "source": [
    "# Replace the placeholder with your own Evidently Cloud token before running.\n",
    "ws = CloudWorkspace(\n",
    "    token=\"YOUR_TOKEN\",\n",
    "    url=\"https://app.evidently.cloud\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "c0XvsN9KRaxW"
   },
   "source": [
    "Create your Project:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 321
    },
    "id": "NeGfuYkGRaIX",
    "outputId": "81b8746b-72b0-4ddd-80f3-d80783bd1851"
   },
   "outputs": [],
   "source": [
    "project = ws.create_project(\"My project title\", team_id=\"YOUR_TEAM_ID\")\n",
    "project.description = \"My project description\"\n",
    "project.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ffgd1g26Rj2r"
   },
   "source": [
    "# Run evaluations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fZXVDkPWRnO1"
   },
   "source": [
    "Prep: map your input data columns. Optional, but recommended."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3RuDVi6gRml5"
   },
   "outputs": [],
   "source": [
    "column_mapping = ColumnMapping(\n",
    "    datetime='start_time',\n",
    "    datetime_features=['end_time'],\n",
    "    text_features=['question', 'response'],\n",
    "    categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zvqFaPKQS0Ws"
   },
   "source": [
    "## Basic example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gYfKvl7RRwzb"
   },
   "source": [
    "Run the first evaluation by checking the chatbot response length. You will use the `TextLength()` descriptor. This will return an absolute count for the number of symbols in each text. You can also check `SentenceCount()`, `WordCount()`, etc."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2pnbi0MfVTqV"
   },
   "source": [
    "To run the evaluation for the first 100 conversations and get a summary Report:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cjrh_OaeR6kR"
   },
   "outputs": [],
   "source": [
    "# Check response length with the TextLength descriptor.\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(column_name=\"response\", descriptors=[TextLength()]),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# No reference dataset: evaluate the first 100 conversations on their own.\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "A75jo9Ym1Mfh"
   },
   "source": [
    "You can also do a side-by-side comparison for two datasets:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4nlqQBzd1Hsw"
   },
   "outputs": [],
   "source": [
    "# Same length check, this time comparing two slices of the logs.\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(column_name=\"response\", descriptors=[TextLength()]),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Rows 0-49 act as the reference, rows 50-99 as the current data.\n",
    "text_evals_report.run(\n",
    "    reference_data=assistant_logs[:50],\n",
    "    current_data=assistant_logs[50:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "G7Wz3dg5W4zt"
   },
   "source": [
    "Let's look at other evaluation methods one by one. You can later combine multiple descriptors in a single Report."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bXm3HDInVbPW"
   },
   "source": [
    "## Text patterns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "vel5lEy4dwxT"
   },
   "source": [
    "You can use regular expressions to check text patterns. For example, check the presence of competitor mentions, topical words, etc.\n",
    "\n",
    "Let's check for responses that contain words related to compensation. This will automatically account for inflected and variant words. This descriptor returns True/False for **pattern match**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Io-DIefjdzh9"
   },
   "outputs": [],
   "source": [
    "# Descriptor that looks for compensation-related words in the text.\n",
    "mentions_compensation = IncludesWords(\n",
    "    words_list=['salary', 'benefits', 'payroll'],\n",
    "    display_name=\"Mention Compensation\",\n",
    ")\n",
    "\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(column_name=\"response\", descriptors=[mentions_compensation]),\n",
    "    ]\n",
    ")\n",
    "\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MmSKpUmDd4f1"
   },
   "source": [
    "Other examples: `Contains(items=[])`, `BeginsWith(prefix=\"\")`, custom `RegEx(reg_exp=r\"\")`, etc."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "AvpestN4Wcl5"
   },
   "source": [
    "## Model-based scoring"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "84OXslVeW-JA"
   },
   "source": [
    "You can use pre-trained machine learning models to score your text data.\n",
    "\n",
    "**Sentiment**. You can use built-in models like `Sentiment()`. This will return a sentiment score from -1 (very negative) to 1 (very positive)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cqBdi0X_lE2B"
   },
   "outputs": [],
   "source": [
    "# Score the sentiment of each response with the built-in Sentiment descriptor.\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(column_name=\"response\", descriptors=[Sentiment()]),\n",
    "    ]\n",
    ")\n",
    "\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "emS5aCa1mbDN"
   },
   "source": [
    "You can also use models from HuggingFace. This will download the models to score your data locally.\n",
    "\n",
    "**Toxicity**. You can use a pre-selected toxicity model with the `HuggingFaceToxicityModel()` descriptor. This returns the predicted toxicity score between 0 and 1.\n",
    "\n",
    "**Neutral emotion**. You can call a named custom model from HuggingFace. For example, let's use the `SamLowe/roberta-base-go_emotions` model and get a score from 0 to 1 for the \"neutral\" label to see if responses convey neutral emotion."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "XI40-HNDoP4T"
   },
   "outputs": [],
   "source": [
    "# Named HuggingFace model, scored on its \"neutral\" label.\n",
    "response_neutrality = HuggingFaceModel(\n",
    "    model=\"SamLowe/roberta-base-go_emotions\",\n",
    "    params={\"label\": \"neutral\"},\n",
    "    display_name=\"Response Neutrality\",\n",
    ")\n",
    "\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(\n",
    "            column_name=\"response\",\n",
    "            descriptors=[\n",
    "                HuggingFaceToxicityModel(),\n",
    "                response_neutrality,\n",
    "            ],\n",
    "        ),\n",
    "    ]\n",
    ")\n",
    "\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1ICoFdUokaN8"
   },
   "source": [
    "See docs on using HuggingFace models as descriptors: https://docs.evidentlyai.com/user-guide/customization/huggingface_descriptor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "oQtaHaMnWhyr"
   },
   "source": [
    "## LLM-as-a-judge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "x0UoZNfdqaup"
   },
   "source": [
    "This descriptor requires an OpenAI key. Pass it as an environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JTbLQee4sTAd"
   },
   "outputs": [],
   "source": [
    "## import os\n",
    "\n",
    "## os.environ[\"OPENAI_API_KEY\"] = \"YOUR KEY\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built-in LLM judges applied to the \"response\" column.\n",
    "report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(\n",
    "            column_name=\"response\",\n",
    "            descriptors=[\n",
    "                DeclineLLMEval(),\n",
    "                PIILLMEval(include_reasoning=False),\n",
    "            ],\n",
    "        ),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Evaluate only the first 10 conversations.\n",
    "report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:10],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grading criteria passed to the judge prompt.\n",
    "conciseness_criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n",
    "            A concise response should:\n",
    "            - Provide the necessary information without unnecessary details or repetition.\n",
    "            - Be brief yet comprehensive enough to address the query.\n",
    "            - Use simple and direct language to convey the message effectively.\n",
    "        \"\"\"\n",
    "\n",
    "# Binary classification prompt: \"concise\" vs. \"verbose\", with reasoning included.\n",
    "conciseness_template = BinaryClassificationPromptTemplate(\n",
    "    criteria=conciseness_criteria,\n",
    "    target_category=\"concise\",\n",
    "    non_target_category=\"verbose\",\n",
    "    uncertainty=\"unknown\",\n",
    "    include_reasoning=True,\n",
    "    pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
    ")\n",
    "\n",
    "custom_judge = LLMEval(\n",
    "    subcolumn=\"category\",\n",
    "    template=conciseness_template,\n",
    "    provider=\"openai\",\n",
    "    model=\"gpt-4o-mini\",\n",
    ")\n",
    "\n",
    "report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(column_name=\"response\", descriptors=[custom_judge]),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Evaluate only the first 10 conversations.\n",
    "report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:10],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ejPGUNARtVGq"
   },
   "source": [
    "You can also run evals using two columns: e.g., context and response, request and response. Check the docs: https://docs.evidentlyai.com/user-guide/customization/llm_as_a_judge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "owWCjBuzzbhA"
   },
   "source": [
    "## Metadata columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9sb5G5kFEagQ"
   },
   "source": [
    "To summarize metadata fields:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_cOfewPbzbMG"
   },
   "outputs": [],
   "source": [
    "# Summarize the \"feedback\" metadata column.\n",
    "data_report = Report(metrics=[ColumnSummaryMetric(column_name=\"feedback\")])\n",
    "\n",
    "data_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "data_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "87EmSBMXWlzX"
   },
   "source": [
    "## Semantic similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "coPM6wQls4uD"
   },
   "source": [
    "**Semantic similarity** evaluates how close two texts are in meaning using an embedding model. It returns a score from 0 to 1 (0: different, 0.5: unrelated, 1: similar). This is a pairwise descriptor. Let's compare the similarity between questions and responses:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DhfkC8ZPtyay"
   },
   "outputs": [],
   "source": [
    "# Pairwise descriptor computed over the \"response\" and \"question\" columns.\n",
    "response_question_similarity = SemanticSimilarity(\n",
    "    display_name=\"Response-Question Similarity\"\n",
    ").on([\"response\", \"question\"])\n",
    "\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        ColumnSummaryMetric(column_name=response_question_similarity),\n",
    "    ]\n",
    ")\n",
    "\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:100],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "text_evals_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "O0x9Xv2gtS03"
   },
   "source": [
    "You can also run evals to compare current responses against \"golden\" reference examples. Check a tutorial on regression testing: https://www.evidentlyai.com/blog/llm-regression-testing-tutorial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Tlkd3ZtkuyHT"
   },
   "source": [
    "# Export evaluation results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MpqX0NHbu0ov"
   },
   "source": [
    "View the dataset with added evaluation results for each row:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xHiMnI-mu4op",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "text_evals_report.datasets().current"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "e_-v_itHvQ1F"
   },
   "source": [
    "Python dictionary with a summary report:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "C8StrCHQSf1c"
   },
   "outputs": [],
   "source": [
    "text_evals_report.as_dict()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8OArmH2YSmX9"
   },
   "source": [
    "JSON with a summary report:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NbFanRgrSgze"
   },
   "outputs": [],
   "source": [
    "text_evals_report.json()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uEc0d1R3SnLf"
   },
   "source": [
    "Save as HTML file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3xVfprKGSh9y"
   },
   "outputs": [],
   "source": [
    "text_evals_report.save_html(\"report.html\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7uBFGtOzveR4"
   },
   "source": [
    "# Monitoring results over time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nRZtzZJLvhSj"
   },
   "source": [
    "Define a combined Report:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PpS-95lxvoTQ"
   },
   "outputs": [],
   "source": [
    "# Metadata columns that each get their own summary, in this order.\n",
    "metadata_columns = [\"feedback\", \"region\", \"organization\", \"model_ID\", \"environment\"]\n",
    "\n",
    "# Text descriptors for the \"response\" column plus one summary per metadata column.\n",
    "text_evals_report = Report(\n",
    "    metrics=[\n",
    "        TextEvals(\n",
    "            column_name=\"response\",\n",
    "            descriptors=[\n",
    "                Sentiment(),\n",
    "                TextLength(),\n",
    "                IncludesWords(\n",
    "                    words_list=['salary', 'benefits', 'payroll'],\n",
    "                    display_name=\"Mention Compensation\",\n",
    "                ),\n",
    "            ],\n",
    "        ),\n",
    "        *[ColumnSummaryMetric(column_name=name) for name in metadata_columns],\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "QNpwynGgwGuq"
   },
   "source": [
    "Run the Report on the first 50 rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-UFIZT29wGYK"
   },
   "outputs": [],
   "source": [
    "# First batch: rows 0-49, with no reference dataset.\n",
    "text_evals_report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:50],\n",
    "    column_mapping=column_mapping,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cYTga8n3StRy"
   },
   "source": [
    "Send evaluation results to Evidently Cloud."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l2kTNd4ySk7O"
   },
   "outputs": [],
   "source": [
    "ws.add_report(project.id, text_evals_report, include_data=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WsqZCPr70NCH"
   },
   "source": [
    "**Cloud UI**. In the Evidently Cloud UI, add a \"Descriptors\" tab and a \"Columns\" tab to create a dashboard that plots the metrics."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Eyd4uD1dwjOg"
   },
   "source": [
    "Let's imitate a few consecutive runs to evaluate batches of data as they come. Run and send several Reports, each time taking 50 rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OStg94Oewisv"
   },
   "outputs": [],
   "source": [
    "# Rows 0-49 were evaluated and sent above. Evaluate and upload the next four\n",
    "# batches of 50 rows (50-99, 100-149, 150-199, 200-249), one Report per batch.\n",
    "for batch_start in range(50, 250, 50):\n",
    "    text_evals_report.run(\n",
    "        reference_data=None,\n",
    "        current_data=assistant_logs[batch_start:batch_start + 50],\n",
    "        column_mapping=column_mapping,\n",
    "    )\n",
    "    ws.add_report(project.id, text_evals_report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_iPVTS7_1kZf"
   },
   "source": [
    "# Extra: Run conditional tests"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0dC_0MOC19-g"
   },
   "source": [
    "You can monitor not only values but whether they comply with the conditions you define. For example:\n",
    "* Average response sentiment should be positive.\n",
    "* Response length should always be non-zero.\n",
    "* The maximum response length should not exceed 2000 symbols.\n",
    "* The mean response length should be above 500 symbols."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Jp7i_7LPA3O5"
   },
   "source": [
    "Define the test suite:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dYgjvUU--vgo"
   },
   "outputs": [],
   "source": [
    "# Conditions checked on descriptors computed from the \"response\" column.\n",
    "test_suite = TestSuite(\n",
    "    tests=[\n",
    "        # Mean sentiment must be at least 0.\n",
    "        TestColumnValueMean(column_name=Sentiment().on(\"response\"), gte=0),\n",
    "        # The shortest response must have a length above 0.\n",
    "        TestColumnValueMin(column_name=TextLength().on(\"response\"), gt=0),\n",
    "        # The longest response must not exceed 2000.\n",
    "        TestColumnValueMax(column_name=TextLength().on(\"response\"), lte=2000),\n",
    "        # Mean response length must be above 500.\n",
    "        TestColumnValueMean(column_name=TextLength().on(\"response\"), gt=500),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "h4WanpX4BdUW"
   },
   "source": [
    "Imitate sending 5 test suite runs in a row with a 1-hour difference in timestamps:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Fl3ix4epA2yQ"
   },
   "outputs": [],
   "source": [
    "# Read the clock once so consecutive runs are stamped exactly one hour apart,\n",
    "# regardless of how long each evaluation takes.\n",
    "first_run_time = datetime.now()\n",
    "batch_size = 50\n",
    "\n",
    "for i in range(5):\n",
    "    test_suite.run(\n",
    "        reference_data=None,\n",
    "        current_data=assistant_logs.iloc[batch_size * i : batch_size * (i + 1), :],\n",
    "        column_mapping=column_mapping,\n",
    "        timestamp=first_run_time + timedelta(hours=i),\n",
    "    )\n",
    "    ws.add_test_suite(project.id, test_suite)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kjua6VurD3H3"
   },
   "source": [
    "**Add a test suite panel**. Copy your Project ID to connect."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Ncr0A6LED-ap"
   },
   "outputs": [],
   "source": [
    "#project = ws.get_project(\"YOUR_PROJECT_ID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Me27PIbpD9Lg"
   },
   "outputs": [],
   "source": [
    "# Full-width, detailed test suite panel; the filter includes test suites.\n",
    "test_results_panel = DashboardPanelTestSuite(\n",
    "    title=\"Test results\",\n",
    "    filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=True),\n",
    "    size=WidgetSize.FULL,\n",
    "    panel_type=TestSuitePanelType.DETAILED,\n",
    ")\n",
    "\n",
    "# Add the panel to the \"Tests\" tab and save the project.\n",
    "project.dashboard.add_panel(test_results_panel, tab=\"Tests\")\n",
    "project.save()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
